In [1]:
import sqlite3
import pandas as pd
import time
import matplotlib.pylab as plt
import gensim.models.word2vec as word2vec
%matplotlib inline
In [11]:
# Absolute path to the SQLite DB holding the brunch follow/writer tables.
# NOTE(review): hardcoded user-specific path — consider a configurable DATA_DIR.
brunch_db_path = '/Users/goodvc/Documents/data/sqllite/brunch_db.db'
In [10]:
def load_all_followings():
    """Load every (userid, writerid) following relation from the SQLite DB.

    Rows where the followed writer is the 'brunch' house account are excluded.

    Returns
    -------
    pandas.DataFrame with the columns of following_tbl.
    """
    sql = """
    select * from following_tbl where writerid <> 'brunch';
    """
    conn = sqlite3.connect(brunch_db_path)
    try:
        return pd.read_sql(sql, conn)
    finally:
        # FIX: original leaked the connection if read_sql raised.
        conn.close()
In [8]:
## corpus generation
## userid = sentence, word = writerid
from random import shuffle

def makeW2VCorpus(ds):
    """Build a word2vec corpus from follow data.

    Each user becomes one "sentence": the list of writer ids that user follows,
    shuffled so word order carries no meaning. Users following fewer than two
    writers are skipped — a one-word sentence yields no context pairs.

    Parameters
    ----------
    ds : pandas.DataFrame with 'userid' and 'writerid' columns.

    Returns
    -------
    list[list[str]] — one shuffled writer-id list per qualifying user.
    """
    corpus = []
    # FIX: original grouped on a needless full copy (ds[:]); group directly.
    for _, group in ds.groupby('userid'):
        writers = group.writerid.tolist()
        if len(writers) < 2:
            continue
        shuffle(writers)
        corpus.append(writers)
    return corpus
In [14]:
# Train writer2vec: each user's followed-writer list is one "sentence".
ds = load_all_followings()
corpus = makeW2VCorpus(ds)
# NOTE(review): `size` is the pre-gensim-4 parameter name (now vector_size) —
# confirm the installed gensim version before re-running.
b2v = word2vec.Word2Vec(corpus, window=20, size=100, min_count=4)
In [12]:
# Re-load the follow data and rebuild the corpus
# (duplicate of the preparation in the training cell above).
ds = load_all_followings()
corpus = makeW2VCorpus(ds)
In [165]:
# 2-dimensional embedding for direct 2D scatter plotting;
# the high min_count (400) keeps only very popular writers.
b2v_2d = word2vec.Word2Vec(corpus, window=40, size=2, min_count=400)
In [171]:
# 3-dimensional embedding for the 3D scatter plot below.
b2v_3d = word2vec.Word2Vec(corpus, window=30, size=3, min_count=300)
In [133]:
# Shape of the embedding matrix: (vocabulary size, vector dimension).
b2v.syn0.shape
Out[133]:
In [175]:
# Plotting setup.
# NOTE(review): pandas and pyplot were already imported at the top of the
# notebook — these re-imports are harmless but redundant.
import pandas as pd
from matplotlib import pyplot as plt
# Axes3D import registers the '3d' projection used by draw3DPlot below.
from mpl_toolkits.mplot3d import Axes3D
import numpy as np
from matplotlib import rcParams
from datetime import datetime
import random
import seaborn as sns
#sns.set(style="ticks")
sns.set(style="white")
# "darkgrid" overrides the "white" style set on the previous line.
sns.set(style="darkgrid")
# Korean-capable font so Hangul writer names render in figure labels.
rcParams['font.family'] = 'NanumGothic'
rcParams.update({'font.size': 15})
def drawPlot(v, x, y, figsize=(18, 10), color='r', alpha=0.7):
    """Scatter-plot a 2D embedding DataFrame, labelling each point with its index.

    Parameters
    ----------
    v : pandas.DataFrame indexed by writer id (index used as the point label).
    x, y : column labels to plot (0 and 1 for the 2D embedding frames).
    figsize, color, alpha : appearance knobs — new optional parameters whose
        defaults reproduce the original hard-coded values.
    """
    v.plot(kind='scatter', x=x, y=y, figsize=figsize, c=color, alpha=alpha)
    for idx, pos in v.iterrows():
        plt.text(pos[x], pos[y], idx, fontdict=dict(alpha=0.7))
    plt.xlabel('Word2Vec 2-dimensional trained by brunch following data', fontsize=15)
    plt.ylabel('')
def draw3DPlot(v, x, y, z):
    """3D scatter of the 3-dim embedding; each point is labelled with the
    writer's display name looked up in the module-level writer_info_dict.

    Parameters
    ----------
    v : pandas.DataFrame indexed by writer id with columns x, y, z.
    x, y, z : column labels holding the three coordinates.
    """
    fig = plt.figure(figsize=(20, 15))
    # FIX: fig.gca(projection='3d') was deprecated in matplotlib 3.4 and
    # removed in 3.6 — add_subplot is the supported way to get 3D axes
    # and works on older versions too.
    threedee = fig.add_subplot(111, projection='3d')
    threedee.scatter(v[x], v[y], v[z], c='r')
    for idx, pos in v.iterrows():
        threedee.text(pos[x], pos[y], pos[z], writer_info_dict[idx]['name'],
                      fontdict=dict(alpha=0.7))
In [157]:
# Sanity check: look up one writer's display name in the info dict.
writer_info_dict['goodvc78']['name']
Out[157]:
In [176]:
# Wrap the 2D/3D embedding matrices in DataFrames indexed by writer id.
# NOTE(review): .syn0 / .vocab are pre-gensim-4 attributes
# (now model.wv.vectors / model.wv.key_to_index) — confirm version.
v = pd.DataFrame(b2v_2d.syn0, index=b2v_2d.vocab)
v3 = pd.DataFrame(b2v_3d.syn0, index=b2v_3d.vocab)
draw3DPlot(v3, 0, 1, 2)
drawPlot(v, 0, 1)
In [73]:
# Non-null counts per column of the follow table.
ds.count()
Out[73]:
In [78]:
# Number of distinct users in the follow data.
len(ds.userid.unique())
Out[78]:
In [144]:
def load_all_writer_info():
    """Load writer profile metadata from the SQLite DB.

    The 'brunch' house account is excluded, matching load_all_followings.

    Returns
    -------
    pandas.DataFrame with the columns of writer_info_tbl.
    """
    sql = """
    select * from writer_info_tbl where writerid <> 'brunch';
    """
    conn = sqlite3.connect(brunch_db_path)
    try:
        return pd.read_sql(sql, conn)
    finally:
        # FIX: original leaked the connection if read_sql raised.
        conn.close()
In [145]:
def writer_info_ds2dict(ds):
    """Index writer metadata by writer id.

    Parameters
    ----------
    ds : pandas.DataFrame containing at least the columns listed below.

    Returns
    -------
    dict mapping writerid -> {col: value} for the selected columns
    (the writerid itself is included, as in the original). If a writerid
    appears more than once, the last row wins.
    """
    cols = ['writerid', 'name', 'profile', 'imgsrc', 'documents',
            'megazines', 'followers', 'followings']
    # IMPROVED: to_dict('records') replaces the row-by-row iterrows loop —
    # same result, idiomatic and faster.
    return {rec['writerid']: rec for rec in ds[cols].to_dict('records')}
In [146]:
# Load writer metadata and index it by writer id for O(1) lookups.
writer_info_ds = load_all_writer_info()
writer_info_dict = writer_info_ds2dict(writer_info_ds)
In [3]:
# Resource directory for saved models and pickles.
# NOTE(review): hardcoded absolute local path.
home_dir = '/Users/goodvc/Data/brunch-recsys/resource'
In [41]:
## word2vec model save
# FIX: the first assignment (a dated snapshot path) was immediately
# overwritten and therefore dead; kept here only as a reference.
# w2v_model_path = '{home}/b2v.20160322.model'.format(home=home_dir)
w2v_model_path = '{home}/b2v.latest.model'.format(home=home_dir)
b2v.save(w2v_model_path)
In [21]:
## writer info dict save to pickle
import pickle
writer_info_path = '{home}/writer.pkl'.format(home=home_dir)
# IMPROVED: context manager guarantees the file is closed even if dump raises.
# Protocol 2 kept for compatibility with the original snapshot.
with open(writer_info_path, 'wb') as output:
    pickle.dump(writer_info_dict, output, 2)
In [5]:
# Reload the dated model snapshot from disk.
# NOTE(review): pickle-based gensim load — only open snapshots you trust.
b2v = word2vec.Word2Vec.load('{home}/b2v.20160322.model'.format(home=home_dir))
In [6]:
# Cosine similarity between two writers' embedding vectors.
b2v.similarity('goodvc78','suyoung')
Out[6]:
In [23]:
def pretty_most_similar(positive=[], negative=[], topn=10):
    """Print brunch profile URLs for the writers most similar to the query,
    one per line with its similarity score, and return the raw neighbour list.
    """
    neighbors = b2v.most_similar(positive=positive, negative=negative, topn=topn)
    for writer, score in neighbors:
        # the second tuple element is the similarity score
        print("https://brunch.co.kr/@{writer}".format(writer=writer), score)
    return neighbors
In [26]:
# 50 writers most similar to 'cojette', printed as brunch URLs.
pretty_most_similar(positive=['cojette'],topn=50)
Out[26]:
In [30]:
# Re-run of the same query (duplicate cell).
pretty_most_similar(positive=['cojette'],topn=50)
Out[30]:
In [31]:
# (vocabulary size, embedding dimension) of the loaded model.
b2v.syn0.shape
Out[31]:
In [32]:
# Scratch note: the two endpoint writer ids used for the embedding-space
# walk (a2z_writers) below. Left as bare names, these would raise NameError.
# goodvc78
# paranmoja
In [33]:
from scipy.spatial import distance

def nestest(NX, v1):
    """Return the row index in NX of the vector nearest to v1 by cosine
    distance, excluding v1 itself when it is an exact row of NX.

    (Function name is a typo for "nearest" — kept for compatibility with callers.)

    Parameters
    ----------
    NX : 2-D array of candidate vectors, one per row.
    v1 : 1-D query vector.

    Returns
    -------
    int index into NX's rows.
    """
    dist = distance.cdist(NX, v1.reshape(1, len(v1)), 'cosine')
    nearest_idx = dist.argmin()
    if (NX[nearest_idx] == v1).all():
        # BUG FIX: the original set dist[nearest_idx] = 1 but the recompute of
        # argmin was commented out, so the self-match index was returned anyway.
        # Use +inf (cosine distance can reach 2, so 1 was not even a safe
        # sentinel) and re-take the argmin.
        dist[nearest_idx] = float('inf')
        nearest_idx = dist.argmin()
    return nearest_idx
In [34]:
def a2z_writers(w2v, a, z, max_steps=100):
    """Walk the embedding space in a straight line from writer `a` to writer
    `z`, collecting each new nearest writer met along the way.

    Parameters
    ----------
    w2v : trained word2vec model (vectors in .syn0, names in .index2word).
    a, z : start and end writer ids.
    max_steps : number of interpolation points between the two vectors.

    Returns
    -------
    list of writer ids, starting with `a` and ending with `z`.
    """
    av = w2v[a]
    zv = w2v[z]
    NX = w2v.syn0
    step = (zv - av) / max_steps
    seen = set([a, z])
    writers = [a]
    for n in range(max_steps):
        point = av + step * n
        name = w2v.index2word[nestest(NX, point)]
        if name not in seen:
            writers.append(name)
            seen.add(name)
    writers.append(z)
    return writers
# Demonstrate the walk from goodvc78 to paranmoja, printed as brunch URLs.
writers = a2z_writers(b2v, 'goodvc78', 'paranmoja', 100)
for name in writers:
    print("https://brunch.co.kr/@{id}".format(id=name))
In [339]:
# Inspect which writers sit at vocabulary indices 200 and 341.
(b2v.index2word[200], b2v.index2word[341])
Out[339]:
In [45]:
# First 100 vocabulary entries (gensim orders the vocab by frequency).
b2v.index2word[:100]
Out[45]:
In [400]:
import mykmeans as kmeans

# Cluster all writer vectors into 20 groups with cosine k-means.
b2v_vec = b2v.syn0
centres, index2cid, dist = kmeans.kmeanssample(b2v_vec, 20,
                                               metric='cosine',
                                               delta=0.00000001,
                                               nsample=0, maxiter=100)
# One row per writer: (writer id, cluster id, distance to its centre),
# ordered cluster-by-cluster with the most central writers first.
# FIX: DataFrame.sort() was deprecated in pandas 0.17 and removed in 0.20 —
# sort_values() is the supported, backward-compatible replacement.
clustered_ds = pd.DataFrame([(a, b, c) for a, b, c in zip(b2v.index2word, index2cid, dist)],
                            columns=['wid', 'cid', 'dist']).sort_values(['cid', 'dist'], ascending=True)
writer2cid = {writer: cid for writer, cid in zip(b2v.index2word, index2cid)}
In [27]:
def writer_cluster(wid, verbose=False):
    """Return the rows of clustered_ds for the cluster containing writer `wid`.

    Unknown writers fall through with cid == -1, which matches no cluster and
    yields an empty frame. When verbose, each member is printed as a brunch
    URL with its distance to the cluster centre.
    """
    cid = writer2cid.get(wid, -1)
    if cid == -1:
        # FIX: original message read "{} is not exist"
        print('{} does not exist'.format(wid))
    ds = clustered_ds[clustered_ds.cid == cid]
    if verbose == True:
        for idx, row in ds.iterrows():
            print("https://brunch.co.kr/@{} {}".format(row.wid, row.dist))
    return ds
def cid_cluster(cid, verbose=False):
    """Return the rows of clustered_ds belonging to cluster `cid`.

    When verbose, print each member as a brunch URL with its distance
    to the cluster centre.
    """
    members = clustered_ds[clustered_ds.cid == cid]
    if verbose == True:
        for _, member in members.iterrows():
            print("https://brunch.co.kr/@{} {}".format(member.wid, member.dist))
    return members
#writer_cluster('suyoung', True)
# For each of the 20 clusters, take its most central writer (first row after
# the sort by distance) and print the chain of writers visited while walking
# the embedding space from 'goodvc78' to that representative.
for c in range(0, 20):
    rep = cid_cluster(c).iloc[0]
    chain = a2z_writers(b2v, 'goodvc78', rep.wid, 100)
    print('goodvc78', '-->', rep.wid)
    for writer in chain:
        print("https://brunch.co.kr/@{id}".format(id=writer))